scBALF Covid-19 data analysis with Monocle

Setting up monocle model by using the normal distribution. (exexpressionFamily=uninormal(), default is the negativeBinomial)

knitr::opts_chunk$set(cache = TRUE)
library(monocle)
## Loading required package: Matrix
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: ggplot2
## Loading required package: VGAM
## Loading required package: stats4
## Loading required package: splines
## Loading required package: DDRTree
## Loading required package: irlba
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data<-read.csv(file = "/Users/Nami/Documents/hackathon/covid-selected-data.csv")
data<-as.matrix(data)
rownames(data)<-data[,1]
data<-data[,-1]
n<-rownames(data)
data<-apply(data,2,as.numeric)
rownames(data)<-n
data<-t(data)
dim(data)
## [1]  1999 23189
true_lables<-read.csv(file="/Users/Nami/Documents/hackathon/covid-selected-data-labels.csv")
true_lables<-as.matrix(true_lables)
rownames(true_lables)<-true_lables[,1]
true_lables<-true_lables[,-1]
n<-rownames(data)
rownames(data)<-n
true_lables<-as.matrix(true_lables)
colnames(true_lables)<-c("cell_type")
true_lables<-as.data.frame(true_lables)
summary(as.factor(true_lables$cell_type))
##   Mild Normal Severe 
##   3292  11978   7919
true_lables<-true_lables %>% mutate(Nor_vs_rest = case_when(
  cell_type == c("Normal") ~ "Normal",
  cell_type != c("Normal") ~ "Rest"))

true_lables<-true_lables %>% mutate(Sev_vs_rest = case_when(
  cell_type == c("Severe") ~ "Severe",
  cell_type != c("Severe") ~ "Rest"))

features<-matrix(0,nrow(data),1)
colnames(features)<-c("gene_short_name")
rownames(features)<-rownames(data)
features[,1]<-rownames(data)
features<-as.data.frame(features)
pd <- new("AnnotatedDataFrame", data = true_lables)
fd <- new("AnnotatedDataFrame", data = features)
covid <- newCellDataSet(as.matrix(data),
                       phenoData = pd, featureData = fd,expressionFamily=uninormal())

Dimension Reduction using tSNE and cell clustering (densityPeak)

knitr::opts_chunk$set(cache = TRUE)
covid <- reduceDimension(covid, max_components = 2, num_dim = 50,
                         norm_method = 'none',
                        reduction_method = 'tSNE',
                        residualModelFormulaStr = "~cell_type",
                        verbose = F)
covid <- clusterCells(covid, num_clusters = 4)
## Distance cutoff calculated to 3.921775
plot_cell_clusters(covid, 1, 2, color = "cell_type")

plot_cell_clusters(covid, 1, 2, color = "Cluster")

plot_cell_clusters(covid, 1, 2, color = "Cluster") +
  facet_wrap(~cell_type)

Dimension Reduction using tSNE and cell clustering(densityPeak)

This time used 200 principal components instead of 50 in the initial PCA to see whether results change or not. Also removed adding pseudo expression and relative expression options.

covid <- reduceDimension(covid, max_components = 2, num_dim = 200,
                         norm_method = 'none',
                        reduction_method = 'tSNE',
                        pseudo_expr = 0,
                        relative_expr = FALSE,
                        residualModelFormulaStr = "~cell_type",
                        verbose = F)
covid <- clusterCells(covid, num_clusters = 4)
## Distance cutoff calculated to 4.707604
plot_cell_clusters(covid, 1, 2, color = "cell_type")

plot_cell_clusters(covid, 1, 2, color = "Cluster")

plot_cell_clusters(covid, 1, 2, color = "Cluster") +
  facet_wrap(~cell_type)

Dimension Reduction using tSNE and cell clustering(Louvain with K=1000 in knn)

covid <- reduceDimension(covid, max_components = 2, num_dim = 50,
                         norm_method = 'none',
                        reduction_method = 'tSNE',
                        pseudo_expr = 0,
                        residualModelFormulaStr = "~cell_type",
                        relative_expr = FALSE,
                        verbose = F)
covid <- clusterCells(covid, method = 'louvain',k=1000)
plot_cell_clusters(covid, 1, 2, color = "cell_type")

plot_cell_clusters(covid, 1, 2, color = "Cluster")

plot_cell_clusters(covid, 1, 2, color = "Cluster") +
  facet_wrap(~cell_type)

Differential Expression Analysis between Normal cells and the rest (Mild + Severe)

marker_genes <- row.names(subset(fData(covid),
                                 gene_short_name %in% gene_short_name))
diff_test_res_Nor <- differentialGeneTest(covid[marker_genes,],
                                      fullModelFormulaStr = "~Nor_vs_rest")
## Warning in if (isSparseMatrix(exprs(X))) {: the condition has length > 1 and
## only the first element will be used
Ordered_diff_test_res_Nor<- diff_test_res_Nor[order(diff_test_res_Nor$qval),]
head(Ordered_diff_test_res_Nor,20)
##            status    family          pval          qval gene_short_name
## SFTA2          OK uninormal  0.000000e+00  0.000000e+00           SFTA2
## RSAD2          OK uninormal  0.000000e+00  0.000000e+00           RSAD2
## IL21           OK uninormal  0.000000e+00  0.000000e+00            IL21
## PEG10          OK uninormal  0.000000e+00  0.000000e+00           PEG10
## IGHV3.69.1     OK uninormal  0.000000e+00  0.000000e+00      IGHV3.69.1
## SPON1          OK uninormal  0.000000e+00  0.000000e+00           SPON1
## IFNL2          OK uninormal  0.000000e+00  0.000000e+00           IFNL2
## SLAMF7         OK uninormal 1.449616e-314 3.622228e-312          SLAMF7
## ADGRF5         OK uninormal 1.859501e-303 4.130158e-301          ADGRF5
## CCL8           OK uninormal 2.451960e-299 4.901469e-297            CCL8
## CALHM6         OK uninormal 2.524630e-296 4.587941e-294          CALHM6
## RGL1           OK uninormal 1.053695e-294 1.755280e-292            RGL1
## CD52           OK uninormal 3.650429e-292 5.613237e-290            CD52
## CXCL10         OK uninormal 1.639982e-280 2.341661e-278          CXCL10
## IL4I1          OK uninormal 9.500626e-234 1.266117e-231           IL4I1
## IGFL2          OK uninormal 7.227738e-230 9.030155e-228           IGFL2
## IDO1           OK uninormal 1.018266e-228 1.197361e-226            IDO1
## GCH1           OK uninormal 3.012913e-210 3.346008e-208            GCH1
## CXCL11         OK uninormal 1.412945e-209 1.486566e-207          CXCL11
## SFTPB          OK uninormal 2.271687e-209 2.270551e-207           SFTPB

Differential Expression Analysis between Severe cells and the rest (Mild + Normal)

diff_test_res_Sev <- differentialGeneTest(covid[marker_genes,],
                                      fullModelFormulaStr = "~Sev_vs_rest")
## Warning in if (isSparseMatrix(exprs(X))) {: the condition has length > 1 and
## only the first element will be used
Ordered_diff_test_res_Sev<- diff_test_res_Sev[order(diff_test_res_Sev$qval),]
head(Ordered_diff_test_res_Sev,20)
##            status    family          pval          qval gene_short_name
## SFTA2          OK uninormal  0.000000e+00  0.000000e+00           SFTA2
## IL21           OK uninormal  0.000000e+00  0.000000e+00            IL21
## PEG10          OK uninormal  0.000000e+00  0.000000e+00           PEG10
## SLAMF7         OK uninormal  0.000000e+00  0.000000e+00          SLAMF7
## IGHV3.69.1     OK uninormal  0.000000e+00  0.000000e+00      IGHV3.69.1
## SPON1          OK uninormal  0.000000e+00  0.000000e+00           SPON1
## IFNL2          OK uninormal  0.000000e+00  0.000000e+00           IFNL2
## RGL1           OK uninormal 4.481175e-321 1.119736e-318            RGL1
## ADGRF5         OK uninormal 1.429792e-310 3.175728e-308          ADGRF5
## CD52           OK uninormal 9.357917e-306 1.870648e-303            CD52
## CRYBA4         OK uninormal 5.007260e-296 9.099557e-294          CRYBA4
## CALHM6         OK uninormal 3.441490e-295 5.732949e-293          CALHM6
## CTSB           OK uninormal 1.118941e-289 1.720586e-287            CTSB
## LGMN           OK uninormal 5.206730e-284 7.434467e-282            LGMN
## CCL8           OK uninormal 5.009365e-275 6.675813e-273            CCL8
## IGFL2          OK uninormal 3.977379e-256 4.969238e-254           IGFL2
## SDS            OK uninormal 2.512973e-236 2.954961e-234             SDS
## RSAD2          OK uninormal 2.981957e-233 3.311628e-231           RSAD2
## SFTPB          OK uninormal 8.618039e-230 9.067084e-228           SFTPB
## HKDC1          OK uninormal 5.975868e-228 5.972880e-226           HKDC1

Intersection of top 100 DE genes between two analysis:

intersect(Ordered_diff_test_res_Nor$gene_short_name[1:100],Ordered_diff_test_res_Sev$gene_short_name[1:100])
##  [1] "SFTA2"      "RSAD2"      "IL21"       "PEG10"      "IGHV3.69.1"
##  [6] "SPON1"      "IFNL2"      "SLAMF7"     "ADGRF5"     "CCL8"      
## [11] "CALHM6"     "RGL1"       "CD52"       "CXCL10"     "IL4I1"     
## [16] "IGFL2"      "IDO1"       "GCH1"       "CXCL11"     "SFTPB"     
## [21] "FABP4"      "SFTA1P"     "SGK1"       "GJA1"       "HKDC1"     
## [26] "CRYBA4"     "GBP5"       "MARCKS"     "CD3E"       "ANKRD22"   
## [31] "CTSE"       "SFN"        "CCL3"       "S100A14"    "PFKFB3"    
## [36] "SDS"        "LGMN"       "KRT7"       "CYR61"      "IFIT1"     
## [41] "CTSB"       "CTSL"       "SAA2.SAA4"  "ATF5"       "IFNL3"     
## [46] "GBP1"       "GBP4"       "TRAV38.1"   "CCL2"       "TSHZ2"     
## [51] "CCL4"       "NINJ1"      "CD38"       "MAL2"       "ALOX5AP"   
## [56] "CD7"        "SPRR1B"     "MAFB"       "TRBV7.4"    "ATF3"      
## [61] "RNASE7"     "CXCL9"      "GZMA"       "FBXW10"     "KRT16"     
## [66] "IGFBP4"     "FGL2"       "KRTAP3.1"   "CLDN3"      "LAMC2"     
## [71] "PLA2G7"     "MARCO"      "IER3"       "NUPR1"      "CD2"       
## [76] "PTPRCAP"    "SPHK1"      "ICAM1"      "TMPRSS11D"  "CD247"     
## [81] "IGHV5.51"   "SPOCK2"     "CEACAM7"    "CD48"       "SFTPA2"    
## [86] "CCR4"       "SPP1"       "LYZ"